Genome-resolved metagenomics on anaerobic enrichments from deep subsurface groundwaters

Author

george.westmeijer@umu.se

Published

April 2, 2025

Read data and assign colors

The 16S ribosomal RNA data is from several sequencing projects. In chronological order, P15009 contains the groundwater samples from t0 (sampled Oct 2019), while P21015 contains the samples from t24 (sampled Feb 2020) from identical groundwaters. P21015 contains the sequencing libraries from the cultures as part of phase a (one meteoric groundwater), while P26201 contains the libraries from the cultures as part of phase b (three groundwaters as inoculum).

All 16S libraries were processed with the bioinformatic pipeline nf-core/ampliseq (v2.12.0), using the SBDI-GTDB (R09-RS220-1) as a reference.

Expand code
seqtab <- read_tsv("data/ASV_table.tsv.gz", col_types = cols(.default = col_character(), 
                                    count = col_integer()
                                    )) %>%
  group_by(sample) %>% mutate(relab = count / sum(count)) %>% ungroup()

tax <- read_tsv("data/ASV_tax.tsv.gz", col_types = cols(.default = col_character()))
               
smd <- read_tsv("data/smd.tsv", col_types = cols(.default = col_character())) 

This project includes 15 metagenomes from 15 unique cultures. The metagenomes contain 35 de-replicated metagenome-assembled genomes (MAG) that are affiliated with the Bacillota (7), Desulfobacterota (7), Pseudomonadota (6), Bacteroidota (4), Patescibacteria (3), Nanoarchaeota (2), Acidobacteriota (1), Campylobacterota (1), Chloroflexota (1), Iainarchaeota (1), Spirochaetota (1), and Verrucomicrobiota (1).

Figure 1: sampling-site
Expand code
coverm <- read_tsv("data/coverm.tsv.gz", col_types = cols(.default = col_character(), tpm = col_double()))

emapper <- read_tsv("data/emapper.tsv.gz", show_col_types = FALSE)

# Sample 35 (l7f) was removed from the binning process as this sample did not produce any bins
bintax <- read_tsv("data/gtdbtk.summary.tsv", col_types = cols(.default = col_character())) 

quality <- read_tsv("data/quality_report.tsv", show_col_types = F) %>%
  rename_with(tolower) %>%
  rename(genome = name) %>%
  filter(genome %in% coverm$genome)
Expand code
cols.gw <- c(
  "KR0015" = "#8D8680",
  "SA1420" = "#A2A475",
  "SA2600" = "#C7B19C"
  )

cols.origin <- c(
  "meteoric" = "#8D8680",
  "marine" = "#A2A475",
  "saline" = "#C7B19C"
  )

cols.fraction <- c(
  "environmental" = "#8D8680",
  "fractionated" = "#A2A475",
  "non-fractionated" = "#C7B19C"
  )

cols.mag <- c(
  "Other" = "#899DA4",
  "Nanoarchaeota" = "#046C9A",
  "Patescibacteria" = "#C7B19C",
  "Pseudomonadota" = "#972D15",
  "Desulfobacterota" = "#A2A475",
  "Bacillota" = "#1B5331"
  )

cols.phylum <- c( # Sorted on abundance
  "Patescibacteria" = "#C7B19C",
  "Desulfobacterota" = "#A2A475",
  "Chloroflexota" = "#D69C4E",
  "Atribacterota" = "#972D15",
  "Acidobacteriota" = "#FAEFD1",
  "Omnitrophota" = "#00A08A",
  "Nitrospirota" = "#D8B70A",
  "Campylobacterota" = "#1B5331",
  "Other" = "#899DA4",
  "Unidentified" = "#046C9A"
)

16S data groundwaters

Rarefaction 16S data

The 16S data from the groundwater samples are rarefied to check if there is sufficient sequencing depth to estimate microbial diversity.

Expand code
t0 <- c(
  "P15009_1057","P15009_1059","P15009_1060","P21015_1020","P21015_1021","P21015_1022", # KR0015
  "P15009_1061","P15009_1065","P15009_1066","P21015_1023","P21015_1024","P21015_1025", # SA1420
  "P15009_1069","P15009_1062","P15009_1063","P21015_1029","P21015_1030","P21015_1031"  # SA2600
       )

# Perform the subsampling
rare <- seqtab %>%
  # Remove negative control
  filter(sample %in% t0) %>%
  select(-relab) %>%
  pivot_wider(names_from = "seqid", values_from = "count", values_fill = 0) %>%
  column_to_rownames("sample") %>%
  vegan::rarecurve(sample = 10000, step = 100, tidy = TRUE) %>%
  tibble() %>%
  rename_with(tolower)
Expand code
rare %>%
  inner_join(smd, by = c("site" = "sample")) %>%
  # One sample has ~ 1 million reads 
  filter(sample < 150000) %>%
  ggplot(aes(sample, species, group = site, colour = origin)) + 
  geom_line(linewidth = 0.75) +
  scale_color_manual(values = cols.origin, name = "Groundwater", labels = c("Meteoric", "Marine", "Saline")) +
  labs(x = "No. of sequenced reads", y = "Rarefied No. of ASVs") +
  scale_x_continuous(labels = c("0", "50.000", "100.000", "150.000")) +
  theme_tidy() +
  theme(
    panel.border = element_rect(fill = "transparent", linewidth = 0.75, colour = "#000000"),

  )
Figure 2: Rarefaction curves for the groundwater samples (t0) for each groundwater type (n = 6).
Expand code
ggsave(filename = "figures/rarecurve.pdf", width = 100, height = 80, units = "mm")

Overview community structure t0

To visualize the community composition, the most abundant phyla are identified while grouping less abundant phyla as “Other”. ASVs that are not identified on phylum level are included as “Unidentified”.

Expand code
t <- seqtab %>%
  filter(sample %in% t0) %>%
  inner_join(tax, by = "seqid") %>%
  group_by(phylum, sample) %>%
  # Sum the abundance of each phylum within a sample
  summarise(relab = sum(relab), .groups = 'drop_last') %>%
  # Calculate the mean abundance of each phylum over the categories
  summarise(mean_relab = sum(relab), .groups = 'drop') %>%
  filter(!is.na(phylum)) %>%
  top_n(8, mean_relab)

t %>% arrange(desc(mean_relab)) %>% knitr::kable()
phylum mean_relab
Patescibacteria 7.1312325
Desulfobacterota 2.7429408
Chloroflexota 1.2292858
Atribacterota 1.2125307
Acidobacteriota 0.9182271
Omnitrophota 0.7237157
Nitrospirota 0.3144247
Campylobacterota 0.2617040
Expand code
taxref <- tax %>%
  left_join(t %>% transmute(phylum, topphylum = phylum), by = "phylum") %>%
  mutate(topphylum = if_else(is.na(phylum), "Unidentified", topphylum)) %>%
  replace_na(list("topphylum" = "Other"))
Expand code
p3 <- seqtab %>%
  filter(sample %in% t0) %>%
  inner_join(taxref, by = "seqid") %>%
  # Summarize in order to have the sum for each category and topphylum
  group_by(sample, topphylum) %>% 
  summarise(mrelab = sum(relab), .groups = 'drop') %>%
  # Call the plot
  ggplot(aes(x = fct_relevel(sample, t0), 
             y = mrelab * 100, 
             fill = fct_relevel(topphylum, names(cols.phylum) %>% rev())
             )
         ) +
  labs(x = '', y = 'Relative abundance (%)', fill = "") +
  geom_col() + scale_y_continuous(expand = c(0.01,0), limits = c(-10,101)) +
  annotate(geom = "segment", x =  0.55, xend =  3.45, y = -1, yend = -1, linewidth = 0.5) +
  annotate(geom = "segment", x =  3.55, xend =  6.45, y = -1, yend = -1, linewidth = 0.5) +
  annotate(geom = "segment", x =  6.55, xend =  9.45, y = -1, yend = -1, linewidth = 0.5) +
  annotate(geom = "segment", x =  9.55, xend = 12.45, y = -1, yend = -1, linewidth = 0.5) +
  annotate(geom = "segment", x = 12.55, xend = 15.45, y = -1, yend = -1, linewidth = 0.5) +
  annotate(geom = "segment", x = 15.55, xend = 18.45, y = -1, yend = -1, linewidth = 0.5) +
  # Labels dates
  annotate(geom = "text", x =  2, y = -3.5, label = "Oct 2019", size = 7/.pt) +
  annotate(geom = "text", x =  5, y = -3.5, label = "April 2020", size = 7/.pt) +
  annotate(geom = "text", x =  8, y = -3.5, label = "Oct 2019", size = 7/.pt) +
  annotate(geom = "text", x = 11, y = -3.5, label = "April 2020", size = 7/.pt) +
  annotate(geom = "text", x = 14, y = -3.5, label = "Oct 2019", size = 7/.pt) +
  annotate(geom = "text", x = 17, y = -3.5, label = "April 2020", size = 7/.pt) +
  # Labels groundwater
  annotate(geom = "text", x = 3.5, y = -8, label = "Meteoric", size = 7/.pt) +
  annotate(geom = "text", x = 9.5, y = -8, label = "Marine", size = 7/.pt) +
  annotate(geom = "text", x =15.5, y = -8, label = "Saline", size = 7/.pt) +
  scale_fill_manual(values = cols.phylum) + 
  guides(fill = guide_legend(ncol = 3, reverse = TRUE)) +
  theme_tidy(legend = "bottom") + 
  theme(
    aspect.ratio = 0.75,
    axis.text.x = element_blank(), 
    axis.ticks.x = element_blank(),
    legend.key.height = unit(3.5, "mm"),
    legend.margin = margin(t = -20),
    legend.text = element_text(margin = margin(l = 2)),
    panel.border = element_rect(fill = "transparent", linewidth = 0.75),
    legend.key.spacing.y = unit(0, "mm")
    )

p3
Figure 3: Community composition of the groundwater samples (t0) for each groundwater type (n = 6) before (October 2019) and after (April 2020) collection of groundwater for inoculation. The eight most abundant phyla are shown, grouping less abundant phyla as “Other” and uncharacterized ASVs as “Unidentified”.

Redundancy analysis

Expand code
site <- magick::image_read("figures/äspö-mod.png")

p1 <- ggplot() + 
  annotation_custom(grid::rasterGrob(site)) + 
  theme_minimal() + coord_fixed() +
  theme(
    plot.margin = margin(), 
    aspect.ratio = 0.75
    )

As the hydrochemistry data is from a monitoring program, the sampling data closest to groundwater sampling was selected. All units are mg l-1, except for EC (mS m-1), pH, T (degrees C), and O18 (ppt SMOW)

Expand code
chem <- read_tsv("data/sicada.txt", show_col_types = FALSE) %>% 
  rename_with(tolower) %>% 
  filter(sampling) %>%
  select(groundwater = idcode, so4, feii, ph_lab, ec_lab, temp_w, doc, nh4_n, o18, s34_so4) %>%
  inner_join(smd[,c("sample","groundwater")], by = "groundwater")

The number of ASVs per sample are estimated with the specnumber() function from the Vegan library.

Expand code
adiv <- seqtab %>%
  select(-relab) %>%
  pivot_wider(names_from = "seqid", values_from = "count", values_fill = 0) %>%
  column_to_rownames('sample') %>%
  vegan::specnumber() %>% as.data.frame() %>%
  rownames_to_column('sample') %>% #filter(sample %in% t0) %>%
  rename(specnumber = 2)

chem <- adiv %>% # Add the species richness for each sample
  inner_join(chem, by = "sample") 

In order to run the RDA, the absolute counts are standardized using a Hellinger transformation (square root of the relative abundance).

Expand code
hellinger <- seqtab %>%
  filter(sample %in% t0) %>%
  # Standard Vegan transformation: Turn table with with samples as *rows*
  select(sample, seqid, count) %>%
  pivot_wider(names_from = "seqid", values_from = "count", values_fill = 0) %>%
  # Turn into a numeric matrix
  column_to_rownames('sample') %>% vegan::decostand(method = "hellinger") %>%
  rownames_to_column("sample") %>%
  pivot_longer(-1, names_to = "seqid", values_to = "hellinger")

m <- hellinger %>% # Create a matrix
  spread(seqid, hellinger) %>%
  column_to_rownames("sample")

The RDA is called while providing a large number of constraining factors, as the RDA function will by default not include redundant variables (Pearson r > 0.9).

Expand code
set.seed(999)

vegan::rda(
  m ~ o18 + doc + ph_lab + ec_lab + temp_w + nh4_n + feii, 
  correlation = T,
  data = chem %>%
    semi_join(chem, by = 'sample') %>%
    filter(sample %in% t0) %>%
    arrange(sample) %>%
    column_to_rownames('sample') 
) -> rda

Some constraints or conditions were aliased because they were redundant. This
can happen if terms are linearly dependent (collinear): 'ph_lab', 'ec_lab',
'temp_w', 'nh4_n', 'feii'
Expand code
# This one will contain the proportions explained which we get by dividing the
# eigenvalue of each component with the sum of eigenvalues for all components.
p <- c(rda$CCA$eig, rda$CA$eig)/sum(c(rda$CCA$eig, rda$CA$eig))
# This one is collecting the coordinates for arrows that will depict how the 
# factors in our model point in the coordinate
a <- rda$CCA$biplot %>% data.frame() %>% tibble::rownames_to_column('factor')

lab <- tibble(factor = c("o18", "doc"),
            label = c("delta^18*O", "DOC")
) %>% inner_join(a, by = "factor")

summary(rda)

Call:
rda(formula = m ~ o18 + doc + ph_lab + ec_lab + temp_w + nh4_n +      feii, data = chem %>% semi_join(chem, by = "sample") %>%      filter(sample %in% t0) %>% arrange(sample) %>% column_to_rownames("sample"),      correlation = T) 

Partitioning of variance:
              Inertia Proportion
Total          0.6761     1.0000
Constrained    0.3871     0.5725
Unconstrained  0.2890     0.4275

Eigenvalues, and their contribution to the variance 

Importance of components:
                        RDA1   RDA2     PC1     PC2     PC3     PC4     PC5
Eigenvalue            0.2338 0.1533 0.05443 0.04926 0.04333 0.02922 0.02273
Proportion Explained  0.3458 0.2267 0.08051 0.07286 0.06409 0.04322 0.03363
Cumulative Proportion 0.3458 0.5725 0.65305 0.72591 0.79001 0.83323 0.86686
                          PC6     PC7     PC8    PC9     PC10     PC11     PC12
Eigenvalue            0.01598 0.01338 0.01224 0.0098 0.008638 0.007691 0.007139
Proportion Explained  0.02363 0.01979 0.01810 0.0145 0.012776 0.011376 0.010559
Cumulative Proportion 0.89049 0.91028 0.92837 0.9429 0.955646 0.967022 0.977581
                          PC13     PC14     PC15
Eigenvalue            0.006254 0.004717 0.004186
Proportion Explained  0.009250 0.006977 0.006192
Cumulative Proportion 0.986831 0.993808 1.000000

Accumulated constrained eigenvalues
Importance of components:
                        RDA1   RDA2
Eigenvalue            0.2338 0.1533
Proportion Explained  0.6040 0.3960
Cumulative Proportion 0.6040 1.0000
Expand code
vegan::RsquareAdj(rda)
$r.squared
[1] 0.5725442

$adj.r.squared
[1] 0.5155501
Expand code
p2 <- rda$CCA$wa %>% data.frame() %>% tibble::rownames_to_column('sample') %>%
  inner_join(chem, by = 'sample') %>%
  ggplot(aes(x = RDA1, y = RDA2)) +
  geom_vline(xintercept = 0, colour = "grey", linetype = "dotted") +
  geom_hline(yintercept = 0, colour = "grey", linetype = "dotted") +
  geom_point(aes(colour = groundwater, size = specnumber), shape = 1, stroke = 0.5) +
  xlab(sprintf("RDA1 (%2.1f%% explained)", p[[1]] * 100)) +
  ylab(sprintf("RDA2 (%2.1f%% explained)", p[[2]] * 100)) +
  geom_segment(
    data = a, mapping = aes(xend = RDA1/5, yend = RDA2/5), linewidth = 0.3,
    x = 0, y = 0, arrow = arrow(length = unit(1.5, "mm"), type = "closed") 
    ) +
  geom_label(
    data = lab, mapping = aes(x = RDA1/10, y = RDA2/10, label = label),
    size = 7/.pt, parse = TRUE, label.size = 0, label.padding = unit(0.1, "lines")
    ) + 
  scale_size(range = c(1,8), name = "No. of ASVs") +
  annotate("text", x = -0.07, y =-0.3, label = "Meteoric", colour = cols.gw[1], size = 7/.pt) +
  annotate("text", x = 0.06, y = 0.28, label = "Marine", colour = cols.gw[2], size = 7/.pt) +
  annotate("text", x =  0.2, y = -0.05, label = "Saline", colour = cols.gw[3], size = 7/.pt) +
  stat_ellipse(aes(x = RDA1, y = RDA2, colour = groundwater), 
               geom = "polygon", show.legend = F,
               fill = NA, linetype = "dashed") +
  # Add R-squared
  annotate("text", x = Inf, y = -Inf, 
           label = "paste(R ^ 2, \" = 0.52\")", 
           size = 7/.pt, vjust = -0.5, hjust = 1.05, parse = TRUE) +
  scale_color_manual(name = "",values = cols.gw, guide = "none") +
  scale_fill_manual(name = "",values = cols.gw, guide = "none") +
  theme_tidy() +
  theme(
    legend.background = element_rect(fill = NA),
    panel.border = element_rect(fill = "transparent", linewidth = 0.75),
    legend.text = element_text(margin = margin(r = 12, unit = "pt")),
    legend.position = "inside",
    legend.position.inside  = c(0.9,0.85)
    )
Expand code
library(patchwork)

p2 + p3 & theme(plot.margin = margin(l = 4))

Expand code
ggsave("figures/fig-1.pdf", width = 18, height = 10, units = "cm")

Network analysis: phylum

Expand code
t <- c(
  "Spirochaetota", "Acidobacteriota", "Atribacterota", "Chloroflexota",
  "Omnitrophota", "Nanoarchaeota", "Campylobacterota", "Pseudomonadota"
)

network.survey <- read_tsv("data/network-survey.tsv", show_col_types = F)

# Correlation between the abundance of the phyla using the cultures
fit <- tibble("phylum" = t, "r.squared" = NA)
for (i in t) {
  model <- lm(relab ~ cpr, data = network.survey %>% filter(phylum == i)) %>% summary()
  fit[fit$phylum == i, "r.squared"] <- model$adj.r.squared %>% round(digits = 2)
}

fit
# A tibble: 8 × 2
  phylum           r.squared
  <chr>                <dbl>
1 Spirochaetota         0.57
2 Acidobacteriota       0.55
3 Atribacterota         0.51
4 Chloroflexota         0.14
5 Omnitrophota          0.14
6 Nanoarchaeota         0.08
7 Campylobacterota      0.05
8 Pseudomonadota       -0.01
Expand code
p1 <- network.survey %>%
  mutate(phylum = factor(phylum, levels = fit$phylum)) %>%
  ggplot(aes(
    x = sqrt(cpr * 100),
    y = sqrt(relab * 100)
  )) +
  geom_point(shape = 21, stroke = 0.75, fill = "white") +
  geom_label(data = fit %>% mutate(phylum = factor(phylum, levels = phylum)), 
            aes(x = Inf, y = -Inf, label = paste("R^{2}:", r.squared)), 
            size = 6/.pt, hjust = 1, vjust = 0, parse = TRUE, label.size = 0, fill = "white") +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black", fill = "#D9D9D9", linewidth = 0.5) + 
  labs(x = "Sqrt relative abundance Patescibacteria (%)", y = "Sqrt relative abundance (%)") +
  theme_tidy(fontsize = 6) +
  scale_y_continuous(labels = function(x) sprintf("%.1f", x)) +
  facet_wrap(~phylum, scales = "free", ncol = 4) +
  theme(
    strip.background = element_rect(fill = "#F0F0F0")
  )
Expand code
t <- c(
  "Omnitrophota", "Chloroflexota", "Pseudomonadota", "Actinomycetota",
  "Verrucomicrobiota", "Acidobacteriota", "Bacillota", "Spirochaetota"
)

network <- seqtab %>%
  inner_join(smd, by = "sample") %>% 
  filter(fraction == "non-fractionated") %>%
  inner_join(tax, by = "seqid") %>% filter(phylum %in% t) %>%
  group_by(sample, phylum) %>% summarise(relab = sum(relab), .groups = "drop") 

network <- seqtab %>% 
  inner_join(tax, by = "seqid") %>% filter(phylum == "Patescibacteria") %>%
  group_by(sample, phylum) %>% summarise(cpr = sum(relab), .groups = "drop") %>% select(-phylum) %>%
  inner_join(network, by = "sample")

# Correlation between the abundance of the phyla using the cultures
fit <- tibble("phylum" = t, "r.squared" = NA)
for (i in t) {
  model <- lm(relab ~ cpr, data = network %>% filter(phylum == i)) %>% summary()
  fit[fit$phylum == i, "r.squared"] <- model$adj.r.squared %>% round(digits = 2)
}

fit <- fit %>% arrange(desc(r.squared))
fit
# A tibble: 8 × 2
  phylum            r.squared
  <chr>                 <dbl>
1 Omnitrophota           0.78
2 Chloroflexota          0.72
3 Pseudomonadota         0.03
4 Actinomycetota         0.01
5 Bacillota              0   
6 Verrucomicrobiota     -0.01
7 Spirochaetota         -0.01
8 Acidobacteriota       -0.02
Expand code
n <- network %>%
  group_by(phylum) %>% summarise(no = n(), .groups = "drop") %>%
  inner_join(fit, by = "phylum") %>% mutate(no = as.character(no)) %>%
  mutate(phylum = factor(phylum, levels = fit$phylum)) %>% 
  arrange(phylum)
  
p2 <- network %>%
  mutate(phylum = factor(phylum, levels = fit$phylum)) %>%
  ggplot(aes(
    x = sqrt(cpr * 100),
    y = sqrt(relab * 100)
  )) +
  geom_point(shape = 21, stroke = 0.75, fill = "white") +
  geom_label(data = n, 
            aes(x = Inf, y = -Inf, label = paste("R^{2}: ", r.squared, " (", no, ")", sep = "")), 
            size = 6/.pt, hjust = 1.0, vjust = -0.0, parse = TRUE, label.size = 0, fill = "white") +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black", fill = "#D9D9D9", linewidth = 0.5) + 
  labs(x = "Sqrt relative abundance Patescibacteria (%)", y = "Sqrt relative abundance (%)") +
  theme_tidy(fontsize = 6) +
  scale_y_continuous(labels = function(x) sprintf("%.1f", x)) +
  facet_wrap(~phylum, scales = "free", ncol = 4) +
  theme(
    strip.background = element_rect(fill = "#F0F0F0")
  )
Expand code
library(patchwork)
p1 / p2 + plot_annotation(tag_levels = c("a", "b")) & theme(
  panel.border = element_rect(fill = "transparent", linewidth = 0.5),
  axis.ticks.length = unit(0.5, "mm"),
  plot.margin = margin(t = 4),
  plot.tag = element_text(),
  plot.tag.position = c(0.035, 0.96),
  strip.background = element_blank(),
  strip.text = element_text(margin = margin(b = 2))
)

Expand code
ggsave("figures/fig-6.pdf", width = 160, height = 160, units = "mm")

Network analysis: order

Expand code
t <- c(
  "Spirochaetota", "Acidobacteriota", "Atribacterota", "Chloroflexota",
  "Omnitrophota", "Nanoarchaeota", "Campylobacterota", "Pseudomonadota"
)

# The file survey-amplicons.tsv contains the ASVs from Äspö groundwaters published in https://doi.org/10.3389/fmicb.2018.02880
survey.amplicons <- read_tsv("data/survey-amplicons.tsv.gz", show_col_types = FALSE) %>%
  group_by(sample) %>% mutate(relab = count / sum(count)) %>% ungroup()

cpr <- survey.amplicons %>%
  filter(phylum == "Patescibacteria") %>%
  group_by(sample, order) %>% summarise(cpr = sum(relab), .groups = "drop") 

network.survey <- survey.amplicons %>%
  filter(phylum %in% t) %>%
  group_by(sample, order) %>% summarise(relab = sum(relab), .groups = "drop") %>% 
  # Add colum with abundance of CPR
  left_join(cpr, by = "sample", relationship = "many-to-many") %>%
  rename(order.clade = order.x, order.cpr = order.y)

# The filter > 19 implies that the two orders have to co-occur at least ten times
fit <- network.survey %>%
  group_by(order.clade, order.cpr) %>% summarize(n = n(), .groups = "drop_last") %>% 
  filter(n > 19) %>% ungroup() %>% na.omit()

for (i in 1:nrow(fit)) {
  data = network.survey %>% filter(order.clade == fit[i,]$order.clade & order.cpr == fit[i,]$order.cpr)
  model <- lm(relab ~ cpr, data = data)
  fit[i, "r.squared"] <- summary(model)$r.squared
}

fit <- fit %>% slice_max(r.squared, n = 8) %>%
  mutate(clade = paste(order.clade, order.cpr)) %>%
  mutate(r.squared = signif(.$r.squared, digits = 2)) %>%
  arrange(desc(r.squared)) %>%
  mutate(n = as.character(n))
Expand code
label <- c(
  "Limnocylindrales UBA9983" = "Limnocylindrales\n-\nUBA9983",
  "Limnocylindrales UBA12405" = "Limnocylindrales\n-\nUBA12405",
  "Limnocylindrales JAHJFT01" = "Limnocylindrales\n-\nJAHJFT01",
  "SCGC-AAA011-G17 UBA2169" = "SCGC-AAA011-G17\n-\nUBA2169",
  "Limnocylindrales JAHISW01" = "Limnocylindrales\n-\nJAHISW01",
  "UBA2200 JAHJFT01" = "UBA2200\n-\nJAHJFT01",
  "Limnocylindrales UBA1369" = "Limnocylindrales\n-\nUBA1369",
  "Limnocylindrales Paceibacterales" = "Limnocylindrales\n-\nPaceibacterales"
  )

p1 <- network.survey %>%
  mutate(clade = paste(order.clade, order.cpr)) %>%
  filter(clade %in% fit$clade) %>%
  mutate(clade = factor(clade, levels = fit$clade)) %>% 
  ggplot(aes(
    x = sqrt(cpr * 100),
    y = sqrt(relab * 100)
  )) +
  geom_point(shape = 21, stroke = 0.75, fill = "white") +
  geom_label(data = fit %>% mutate(clade = factor(clade, levels = clade)), 
            aes(x = Inf, y = -Inf, label = paste("R^{2}: ", r.squared, " (", n, ")", sep = "")), 
            size = 6/.pt, hjust = 1.0, vjust = -0.0, parse = TRUE, label.size = 0, fill = "white") +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black", fill = "#D9D9D9", linewidth = 0.5) + 
  labs(x = "Sqrt relative abundance Patescibacteria (%)", y = "Sqrt relative abundance (%)") +
  theme_tidy(fontsize = 6) +
  scale_y_continuous(labels = function(x) sprintf("%.1f", x)) +
  facet_wrap(~clade, scales = "free", ncol = 4, labeller = as_labeller(label)) +
  theme(
    strip.background = element_rect(fill = "#F0F0F0")
  )
Expand code
t <- c(
  "Omnitrophota", "Chloroflexota", "Pseudomonadota", "Actinobacteriota",
  "Verrucomicrobiota", "Acidobacteriota", "Bacillota", "Spirochaetota"
)

cpr <- seqtab %>%
  inner_join(smd, by = "sample") %>% 
  filter(fraction == "non-fractionated") %>%
  inner_join(tax, by = "seqid") %>% filter(phylum == "Patescibacteria") %>%
  group_by(sample, order) %>% summarise(cpr = sum(relab), .groups = "drop") 

network <- seqtab %>%
  inner_join(smd, by = "sample") %>% 
  filter(fraction == "non-fractionated") %>%
  inner_join(tax, by = "seqid") %>% filter(phylum %in% t) %>%
  group_by(sample, order) %>% summarise(relab = sum(relab), .groups = "drop") %>% 
  # Add colum with abundance of CPR
  left_join(cpr, by = "sample", relationship = "many-to-many") %>%
  rename(order.clade = order.x, order.cpr = order.y) 

# The filter > 19 implies that the two orders have to co-occur at least ten times
fit <- network %>%
  group_by(order.clade, order.cpr) %>% summarize(n = n(), .groups = "drop_last") %>% 
  filter(n > 19) %>% ungroup() %>% na.omit()

for (i in 1:nrow(fit)) {
  data = network %>% filter(order.clade == fit[i,]$order.clade & order.cpr == fit[i,]$order.cpr)
  model <- lm(relab ~ cpr, data = data)
  fit[i, "r.squared"] <- summary(model)$adj.r.squared
}
fit <- fit %>% slice_max(r.squared, n = 8) %>%
  mutate(clade = paste(order.clade, order.cpr)) %>%
  mutate(r.squared = signif(.$r.squared, digits = 2)) %>%
  arrange(desc(r.squared)) %>%
  mutate(n = as.character(n))
Expand code
label = c(
  "Dehalococcoidales Magasanikbacterales" = "Dehalococcoidales\n-\nMagasanikbacterales",
  "Gygaellales Portnoybacterales" = "Gygaellales\n-\nPortnoybacterales",
  "Gygaellales Moranbacterales" = "Gygaellales\n-\nMoranbacterales",
  "Dehalococcoidales SG8-24" = "Gygaellales\n-\nSG8-24",
  "Dehalococcoidales BM507" = "2-01-FULL-45-10\n-\nBM507",
  "Pluralincolimonadales Portnoybacterales" = "2-01-FULL-45-10\n-\nPortnoybacterales",
  "Pluralincolimonadales Paceibacterales" = "Gorgyraeales\n-\nPaceibacterales",
  "Gygaellales UBA10092" = "2-01-FULL-45-10\n-\nUBA10092"
)


p2 <- network %>%
  mutate(clade = paste(order.clade, order.cpr)) %>%
  filter(clade %in% fit$clade) %>%
  mutate(clade = factor(clade, levels = fit$clade)) %>%
  ggplot(aes(
    x = sqrt(cpr * 100),
    y = sqrt(relab * 100)
  )) +
  geom_point(shape = 21, stroke = 0.75, fill = "white") +
  geom_label(data = fit %>% mutate(clade = factor(clade, levels = clade)), 
            aes(x = Inf, y = -Inf, label = paste("R^{2}: ", r.squared, " (", n, ")", sep = "")), 
            size = 6/.pt, hjust = 1.0, vjust = -0.0, parse = TRUE, label.size = 0, fill = "white") +  
  geom_smooth(method = "lm", formula = y ~ x, colour = "black", fill = "#D9D9D9", linewidth = 0.5) + 
  labs(x = "", y = "Sqrt relative abundance (%)") +
  theme_tidy(fontsize = 6) +
  scale_y_continuous(labels = function(x) sprintf("%.1f", x)) +
  facet_wrap(~clade, scales = "free", ncol = 4, labeller = as_labeller(label)) +
  theme(
    strip.background = element_rect(fill = "#F0F0F0")
  )

p2

Expand code
library(patchwork)

p1 / p2 +
  plot_annotation(tag_levels = c("a", "b")) & theme(
  panel.border = element_rect(fill = "transparent", linewidth = 0.5),
  axis.ticks.length = unit(0.5, "mm"),
  plot.margin = margin(b = 4),
  plot.tag.position = c(0.1, 0.95)
)

Expand code
ggsave("figures/fig-s4.pdf", width = 140, height = 160, units = "mm")

16S data cultures

NMDS cultures three groundwaters (supplemental)

Expand code
set.seed(999)

nmds <- seqtab %>% # Full size fraction
  inner_join(smd, by = "sample") %>%
  filter(!sample %in% paste("P21015_10", seq(42,81), sep = "")) %>% # Data from bachelor project
  filter(fraction == "non-fractionated" | fraction == "environmental") %>%
  group_by(seqid) %>% filter(count > 2) %>% ungroup() %>%
  select(seqid, sample, count) %>% spread(seqid, count, fill = 0) %>%
  column_to_rownames("sample") %>% vegan::metaMDS()
Square root transformation
Wisconsin double standardization
Run 0 stress 0.1923165 
Run 1 stress 0.2120395 
Run 2 stress 0.1923165 
... Procrustes: rmse 4.071803e-06  max resid 1.411655e-05 
... Similar to previous best
Run 3 stress 0.1923165 
... Procrustes: rmse 4.97049e-06  max resid 1.741608e-05 
... Similar to previous best
Run 4 stress 0.2227196 
Run 5 stress 0.2150739 
Run 6 stress 0.2302776 
Run 7 stress 0.2093082 
Run 8 stress 0.2093082 
Run 9 stress 0.2247649 
Run 10 stress 0.1923165 
... Procrustes: rmse 9.752949e-06  max resid 3.745239e-05 
... Similar to previous best
Run 11 stress 0.2236037 
Run 12 stress 0.2215293 
Run 13 stress 0.2213196 
Run 14 stress 0.2142431 
Run 15 stress 0.1923165 
... Procrustes: rmse 7.963952e-06  max resid 2.772181e-05 
... Similar to previous best
Run 16 stress 0.1923165 
... Procrustes: rmse 1.028341e-05  max resid 4.281696e-05 
... Similar to previous best
Run 17 stress 0.2120395 
Run 18 stress 0.1923165 
... Procrustes: rmse 6.422786e-06  max resid 2.085941e-05 
... Similar to previous best
Run 19 stress 0.2154001 
Run 20 stress 0.1923165 
... New best solution
... Procrustes: rmse 6.318389e-06  max resid 2.826773e-05 
... Similar to previous best
*** Best solution repeated 1 times
Expand code
vegan::scores(nmds)$sites %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  inner_join(smd, by = "sample") -> nmds.scores

Plot the ordination

Expand code
p1 <- nmds.scores %>%
  mutate(medium = if_else(medium == "env","Environmental","Culture")) %>%
  # Plot
  ggplot(aes(x = NMDS1, y = NMDS2
             )) +
  geom_vline(xintercept = 0, linetype = 'dotted') +
  geom_hline(yintercept = 0, linetype = 'dotted') +
  # Points for samples, coloured by origin
  geom_point(aes(fill = origin,
                 colour = origin,
                 shape = medium,
                 ), size = 3, stroke = 0.5) +
  theme_tidy() +
  scale_shape_manual(values = c("Culture" = 21, "Environmental" = 23), name = "") +
  scale_fill_manual(values = alpha(cols.origin, alpha = 0.5), guide = "none") +
  scale_colour_manual(values = cols.origin, guide = "none") +
  annotate('text', x = -Inf, y = -Inf, size = 7/.pt, 
           label = paste('Stress = ', round(nmds$stress, digits = 2)),
           hjust = -0.1, vjust = -1,
           ) +
  ggtitle("All cells") +
  theme(
    plot.title = element_text(hjust = .5, size = 7)
    )
Expand code
nmds <- seqtab %>% # Small size fraction
  inner_join(smd, by = "sample") %>%
  filter(!sample %in% paste("P21015_10", seq(42,81), sep = "")) %>% # Data from bachelor project
  filter(fraction == "0.1 - 0.45 µm" | fraction == "environmental") %>%
  group_by(seqid) %>% filter(count > 2) %>% ungroup() %>%
  select(seqid, sample, count) %>% spread(seqid, count, fill = 0) %>%
  column_to_rownames("sample") %>% vegan::metaMDS()
Square root transformation
Wisconsin double standardization
Run 0 stress 0.1068401 
Run 1 stress 0.2760336 
Run 2 stress 0.10684 
... New best solution
... Procrustes: rmse 3.267484e-05  max resid 9.142731e-05 
... Similar to previous best
Run 3 stress 0.1068401 
... Procrustes: rmse 8.950307e-05  max resid 0.0003677584 
... Similar to previous best
Run 4 stress 0.10684 
... Procrustes: rmse 2.829675e-05  max resid 0.0001164834 
... Similar to previous best
Run 5 stress 0.10684 
... New best solution
... Procrustes: rmse 5.316658e-05  max resid 0.0002191173 
... Similar to previous best
Run 6 stress 0.1068401 
... Procrustes: rmse 0.0001439016  max resid 0.0005893127 
... Similar to previous best
Run 7 stress 0.1068401 
... Procrustes: rmse 0.000100918  max resid 0.0004154544 
... Similar to previous best
Run 8 stress 0.10684 
... Procrustes: rmse 1.307685e-05  max resid 5.27817e-05 
... Similar to previous best
Run 9 stress 0.10684 
... Procrustes: rmse 8.745294e-05  max resid 0.0003599189 
... Similar to previous best
Run 10 stress 0.10684 
... Procrustes: rmse 1.398977e-05  max resid 5.399632e-05 
... Similar to previous best
Run 11 stress 0.1068401 
... Procrustes: rmse 0.000182129  max resid 0.0007494872 
... Similar to previous best
Run 12 stress 0.10684 
... Procrustes: rmse 1.292282e-05  max resid 4.180245e-05 
... Similar to previous best
Run 13 stress 0.1068402 
... Procrustes: rmse 0.0001867016  max resid 0.0007641379 
... Similar to previous best
Run 14 stress 0.10684 
... Procrustes: rmse 6.740886e-05  max resid 0.0002775373 
... Similar to previous best
Run 15 stress 0.10684 
... New best solution
... Procrustes: rmse 1.911565e-05  max resid 7.683967e-05 
... Similar to previous best
Run 16 stress 0.10684 
... Procrustes: rmse 3.965406e-05  max resid 0.0001563627 
... Similar to previous best
Run 17 stress 0.10684 
... Procrustes: rmse 1.868823e-05  max resid 7.486526e-05 
... Similar to previous best
Run 18 stress 0.1068401 
... Procrustes: rmse 0.0001394823  max resid 0.0005685175 
... Similar to previous best
Run 19 stress 0.1068401 
... Procrustes: rmse 0.000156072  max resid 0.0006370961 
... Similar to previous best
Run 20 stress 0.10684 
... Procrustes: rmse 1.551736e-05  max resid 6.113502e-05 
... Similar to previous best
*** Best solution repeated 6 times
Expand code
vegan::scores(nmds)$sites %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  inner_join(smd, by = "sample") -> nmds.scores
Expand code
p2 <- nmds.scores %>%
  mutate(medium = if_else(medium == "env","Environmental","Culture")) %>%
  # Plot
  ggplot(aes(x = NMDS1, y = NMDS2)) +
  geom_vline(xintercept = 0, linetype = 'dotted') +
  geom_hline(yintercept = 0, linetype = 'dotted') +
  # Points for samples, coloured by nature
  geom_point(aes(
    colour = origin,
    fill = origin, 
    shape = medium), size = 3, stroke = 0.5) +
  theme_tidy() +
  scale_shape_manual(values = c("Culture" = 21, "Environmental" = 23), name = "") +
  scale_fill_manual(values = alpha(cols.origin, alpha = 0.5), guide = "none") +
  scale_colour_manual(values = cols.origin, name = "Groundwater", labels = c("Meteoric", "Marine", "Saline")) +
  annotate('text', x = -Inf, y = -Inf, size = 7/.pt, 
           label = paste('Stress = ', round(nmds$stress, digits = 2)),
           hjust = -.1, vjust = -1,
           ) +
  ggtitle("< 0.45 µm fraction") +
  theme(plot.title = element_text(hjust = .5, size = 7),
        axis.title.y = element_blank())
Expand code
library(patchwork)
Expand code
p1 + p2 + plot_layout(guides = "collect") & 
  theme(
    legend.position = "right",
    plot.background = element_blank(),
    legend.text = element_text(margin = margin()),
    legend.box.margin = margin(l = -10),
    plot.margin = margin(l = 4)
    )

Expand code
ggsave("figures/fig-s2.pdf", height = 9, width = 18, units = "cm")

Diversity (alpha + beta) bachelor project (P21015)

Estimate alpha diversity using the Shannon-Weaver index on species level

Expand code
adiv <- seqtab %>%
  select(-relab) %>%
  spread(seqid, count, fill = 0) %>% 
  column_to_rownames('sample') %>%
  vegan::diversity() %>% as.data.frame() %>%
  rownames_to_column('sample') %>%
  rename(shannon = 2) %>%
  inner_join(smd, by = "sample")

adiv %>% filter(sample %in% t0) %>%
  group_by(groundwater) %>% 
  summarise(mean = round(mean(shannon), digits = 2), sd = round(sd(shannon), digits = 2), n = n())
# A tibble: 3 × 4
  groundwater  mean    sd     n
  <chr>       <dbl> <dbl> <int>
1 KR0015       5.66  0.62     6
2 SA1420       3.27  1.07     6
3 SA2600       3.03  0.44     6
Expand code
p1 <- adiv %>%
  filter(fraction != "control") %>%
  mutate(fraction = factor(fraction, levels = c("environmental","fractionated","non-fractionated"))) %>%
  ggplot(aes(
    x = groundwater,
    y = shannon,
    fill = fraction
    )) +
  geom_boxplot( outlier.colour = "white") +
  labs(x = "", y = "Shannon's diversity index (H)", fill = "") +
  theme_tidy(legend = "inside", fontsize = 6) +
  scale_x_discrete(labels = c("Meteoric", "Marine", "Saline")) + 
  scale_fill_manual(values = cols.fraction, labels = c("Environmental", "< 0.45 µm", "All cells")) +
  theme( 
    legend.margin = margin(),
    legend.background = element_rect(fill = "NA"),
    legend.position.inside = c(0.8,0.88),
    legend.text = element_text(margin = margin(l = 1))
  )
Expand code
set.seed(999)

nmds <- seqtab %>%
  filter(sample %in% paste("P21015_10", seq(42,81), sep = "") |
           sample %in% c("P15009_1057","P15009_1059","P15009_1060","P21015_1020","P21015_1021","P21015_1022")
           ) %>%
  select(seqid, sample, count) %>% spread(seqid, count, fill = 0) %>%
  column_to_rownames("sample") %>%
  vegan::metaMDS()
Square root transformation
Wisconsin double standardization
Run 0 stress 0.1203684 
Run 1 stress 0.1448066 
Run 2 stress 0.1374292 
Run 3 stress 0.1212802 
Run 4 stress 0.1331208 
Run 5 stress 0.1230642 
Run 6 stress 0.13485 
Run 7 stress 0.14842 
Run 8 stress 0.133246 
Run 9 stress 0.1408211 
Run 10 stress 0.1212186 
Run 11 stress 0.1440819 
Run 12 stress 0.1336567 
Run 13 stress 0.1400868 
Run 14 stress 0.1447862 
Run 15 stress 0.1504146 
Run 16 stress 0.1212186 
Run 17 stress 0.1342165 
Run 18 stress 0.1305863 
Run 19 stress 0.133246 
Run 20 stress 0.1476626 
*** Best solution was not repeated -- monoMDS stopping criteria:
    14: stress ratio > sratmax
     6: scale factor of the gradient < sfgrmin
Expand code
nmds.scores <- vegan::scores(nmds)$sites %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  inner_join(smd, by = "sample")
Expand code
# 'i' contains the samples that will be included in panel c
i <- c(
  "P21015_1045", "P21015_1049", "P21015_1053", "P21015_1065", "P21015_1069", "P21015_1077", # Lysate
  "P21015_1043", "P21015_1047", "P21015_1051", "P21015_1055", "P21015_1075", "P21015_1079"  # Acetate
)

i <- smd %>% 
  filter(grepl("P21", sample)) %>%
  filter(fraction == "fractionated") %>% arrange(medium) %>% pull(sample)

p2 <- nmds.scores %>%
  mutate(medium = factor(medium, levels = c("ace","lys","env"))) %>%
  ggplot(aes(x = NMDS1, y = NMDS2, 
             color = fraction, 
             shape = medium,
             fill = fraction
             )
         ) +
  geom_vline(xintercept = 0, linetype = 'dotted') +
  geom_hline(yintercept = 0, linetype = 'dotted') +
  geom_point(size = 4.0, stroke = 0.75) + 
  geom_text(data = nmds.scores, 
            aes(label = age), 
            size = 6/.pt, color = "black", na.rm = TRUE) + 
  scale_shape_manual(values = c("ace" = 23, "lys" = 24, "env" = 21), 
                     labels = c("Acetate", "Lysate", "Environmental")) +
  scale_fill_manual(values = alpha(cols.fraction, alpha = 0.2), labels = c("Environmental", "< 0.45 µm", "All cells")) +
  scale_color_manual(values = cols.fraction, labels = c("Environmental", "0.1 - 0.45 µm", "All cells")) +
  annotate('text', x = Inf, y = -Inf, size = 6/.pt, hjust = 1.05, vjust = -0.5, 
           label = paste('Stress = ', round(nmds$stress, digits = 2))) +
  theme_tidy(legend = "bottom", fontsize = 6) +
  theme(
    legend.title = element_blank(),
    legend.box = "vertical",
    legend.spacing.x = unit(0, "mm"),
    legend.box.margin = margin(),
    legend.margin = margin(),
    legend.text = element_text(margin = margin(l = 2, r = 8, unit = "pt")),
    legend.key.spacing.x = unit(0, "mm")
    )
Expand code
cols.phylum <- c(
  "Pseudomonadota" = "#972D15",
  "Spirochaetota" = "#D8B70A",
  "Patescibacteria" = "#C7B19C",
  "Nanoarchaeota" = "#046C9A",
  "Omnitrophota" = "#00A08A",
  "Desulfobacterota" = "#A2A475",
  "Acidobacteriota" = "#FAEFD1"
  )  

p3 <- seqtab %>%
  filter(sample %in% i) %>%
  inner_join(tax, by = "seqid") %>% filter(phylum %in% names(cols.phylum)) %>%
  group_by(sample, phylum) %>% summarise(n = sum(relab) * 100, .groups = "drop") %>%
  # Order the samples and the phyla
  mutate(phylum = factor(phylum, levels = names(cols.phylum))) %>%
  mutate(sample = factor(sample, levels = i)) %>%
  ggplot(aes(
  x = sample,
  y = n,
  fill = fct_rev(phylum)
  )) + geom_col() + scale_fill_manual(values = cols.phylum) + scale_y_continuous(limits = c(NA, 100)) +
  geom_text(data = smd %>% filter(sample %in% i) %>% mutate(sample = factor(sample, i)),
            aes(x = sample, y = 3, label = age), size = 6/.pt, inherit.aes = FALSE
            ) +
  labs(x = "", y = "Relative abundance (%)") + theme_tidy(legend = "bottom", fontsize = 6) +
  guides(fill = guide_legend(ncol = 2, reverse = TRUE)) +
  annotate("segment", x = 0.5, xend = 10.45, y = -2, linewidth = 0.5) + annotate("text", x = 5.5, y = -5, label = "Acetate", size = 6/.pt) +
  annotate("segment", x = 10.55, xend = 20.45, y = -2, linewidth = 0.5) + annotate("text", x = 15.5, y = -5, label = "Lysate", size = 6/.pt) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.title = element_blank(),
    legend.key.width = unit(3.5, "mm"), legend.key.height = unit(4.5, "mm"),
    legend.text = element_text(margin = margin(l = 2)),
    legend.key.spacing.y = unit(0.2, "mm"),
    legend.margin = margin(t = -15)
  )
Expand code
p1 + p2 + p3 + plot_annotation(tag_levels = c("a")) & theme(
  legend.key.size = unit(3, "mm"),
  legend.title = element_blank(),
  #legend.margin = margin(),
  panel.border = element_rect(fill = "transparent", linewidth = 0.5),
  plot.tag.location = "panel",
  plot.tag.position = c(0.03, 0.97),
  plot.margin = margin(r = 4),
  aspect.ratio = 1
  )

Expand code
ggsave("figures/fig-3.pdf", width = 18, height = 9, units = "cm")

QPCR KR0015 (Fig 2)

Expand code
qpcr <- read_tsv('data/qpcr.tsv', col_types = cols(.default = col_character(), 
                                      quantity = col_double(),
                                      age = col_factor(levels = seq(1:10) %>% as.character())
                                      )
         ) %>% mutate(medium = gsub("ace","srm", medium)) %>%
  rename(kingdom = Kingdom)
Expand code
p1 <- qpcr %>%
  group_by(sample, kingdom) %>% 
  summarise(mean = mean(quantity), sd = sd(quantity), .groups = 'drop') %>%
  inner_join(smd, by = "sample") %>%
  mutate(category = paste(medium, fraction)) %>%
  # Call plot
  ggplot(aes(x = fct_relevel(age, seq(1:10) %>% as.character), 
             y = mean, 
             group = category, 
             fill = fraction
             )
         ) +
  geom_line(aes(linetype = medium)) +
  facet_wrap(~ factor(kingdom, levels = c("Bacteria", "Archaea"))) +
  geom_pointrange(aes(ymin = mean - sd, 
                      ymax = mean + sd), 
                  shape = 21, stroke = 0.5) +
  labs(x = 'Week', y = expression("Gene copy number (mL"^"-1"*")")) +
  scale_y_log10(breaks = c(1e3,1e4,1e5,1e6,1e7), 
                labels = c(expression("10"^"3"),
                           expression("10"^"4"),
                           expression("10"^"5"),
                           expression("10"^"6"),
                           expression("10"^"7")
                           )
                ) +
  scale_linetype(name = "Medium:", labels = c("Acetate", "Cell lysate")) +
  scale_fill_manual(values = cols.fraction, 
                    labels = c("< 0.45 µm fraction", "All cells"),
                    name = "") +
  theme_tidy(legend = "right") +
  theme(
    plot.margin = margin(),
    legend.text = element_text(margin = margin(l = 0)),
    axis.ticks.length = unit(0.5, "mm"),
    panel.border = element_rect(fill = "transparent", linewidth = 0.75),
    legend.box.margin = margin(-8),
    )

Microscopy (Fig 2)

Expand code
growth <- read_tsv("data/micro_cultures.tsv", col_types = cols(.default = col_character(), 
                                                  count = col_integer(),
                                                  date = col_date(format = "%d-%m-%Y"))
                   ) %>%
  mutate(week = case_when(
    date == "2021-10-15" ~  1,
    date == "2021-11-11" ~  5,
    date == "2021-11-25" ~  7,
    date == "2021-12-09" ~  9,
    date == "2021-12-30" ~ 12,
    date == "2022-01-18" ~ 15,
    date == "2022-02-03" ~ 17
  )) %>%
  filter(!sample %in% c("P26201_1051","P26201_1024")) %>%
  mutate(label = case_when(
    groundwater == "KR0015" ~  "Meteoric",
    groundwater == "SA1420" ~  "Marine",
    groundwater == "SA2600" ~  "Saline"
  ))

counts <- tibble(
  label = c("Meteoric","Marine","Saline"),
  min = c(680220, 59220, 14100),
  max = c(890910, 73910, 23000)
  )
Expand code
# Microscopy counts on KR0015 groundwater in duplicates, obtaining 680,220 and 890,910 cells ml-1.
# Microscopy counts on SA1420 groundwater in duplicates, obtaining 59220 and 73910 cells ml-1.
# For SA2600, this number was 14,100 and 23,000 cells ml-1

p2 <- growth %>%
  ggplot(aes(x = week, y = count, fill = fraction)) +
  geom_rect(data = counts, aes(xmin = 1, xmax = 17, ymin = min, ymax = max), fill = "#8D8680", alpha = 0.5, inherit.aes = FALSE) +
  geom_line(aes(linetype = medium)) + 
  geom_point(aes(fill = fraction), shape = 21, stroke = 0.5, size = 2) +
  scale_y_log10(limits = c(1e4,1e8),
                breaks = c(1e4,1e5,1e6,1e7),
                labels = c(expression("10"^"4"),
                           expression("10"^"5"),
                           expression("10"^"6"),
                           expression("10"^"7"))
                ) +
  scale_x_continuous(breaks = c(1,5,7,9,12,15,17), 
                     labels = c("1", "5", "", "9", "12", "", "17")) +
  scale_fill_manual(values = cols.fraction, guide = "none") +
  scale_linetype_manual(values = c("srm" = 1, "lys" = 2), name = "Medium:", 
                 labels = c("Acetate", "Cell lysate"), 
                 guide = "none") +
  facet_wrap(~ factor(label, levels = c("Meteoric", "Marine", "Saline")), ncol = 3) +
  labs(x = "Week", y = expression("Cell number (mL"^"-1"*")"), colour = "Fraction:") +
  theme_tidy(legend = "bottom") + 
  theme(
    legend.margin = margin(t = -10, l = 20),
    plot.margin = margin(),
    panel.border = element_rect(fill = "transparent", linewidth = 0.5)
    )
Expand code
library(patchwork)
Expand code
p1 + p2 + plot_layout(nrow = 2, heights = c(3,2), guides = "collect") + plot_annotation(tag_levels = "a") & theme(
  plot.margin = margin(b = 6),
  legend.position = "bottom",
  plot.tag.location = "plot",
  plot.tag.position = c(0.09, 0.96)
  )

Expand code
ggsave(filename = "figures/fig-2.pdf", width = 14, height = 14, units = "cm")

Barplot KR0015

Expand code
t <- seqtab %>%
  filter(sample %in% paste("P21015", seq(1042,1081), sep = "_")
           ) %>%
  inner_join(tax, by = "seqid") %>%
  group_by(phylum, sample) %>%
  # Sum the abundance of each phylum within a sample
  summarise(relab = sum(relab), .groups = 'drop_last') %>%
  # Calculate the mean abundance of each phylum over the categories
  summarise(mean_relab = sum(relab), .groups = 'drop') %>%
  filter(!is.na(phylum)) %>%
  top_n(9, mean_relab) %>% filter(phylum != "Bacteroidota")

# The most abundant phyla, averaged over all cultures
t %>% arrange(desc(mean_relab))
# A tibble: 8 × 2
  phylum           mean_relab
  <chr>                 <dbl>
1 Pseudomonadota       10.8  
2 Acidobacteriota       7.90 
3 Spirochaetota         6.28 
4 Patescibacteria       6.16 
5 Desulfobacterota      3.22 
6 Bacillota             1.54 
7 Nanoarchaeota         0.967
8 Omnitrophota          0.853
Expand code
taxref <- tax %>%
  left_join(t %>% transmute(phylum, topphylum = phylum), by = "phylum") %>%
  replace_na(list("topphylum" = "Other"))

cols.phylum <- c(
  "Pseudomonadota" = "#972D15",
  "Acidobacteriota" = "#FAEFD1",
  "Spirochaetota" = "#D8B70A",
  "Patescibacteria" = "#C7B19C",
  "Desulfobacterota" = "#A2A475",
  "Bacillota" = "#1B5331",
  "Nanoarchaeota" = "#046C9A",
  "Omnitrophota" = "#00A08A",
  "Other" = "#899DA4"
)
Expand code
seqtab %>%
  filter(sample %in% paste("P21015", seq(1042,1081), sep = "_")
           ) %>%
  inner_join(taxref, by = "seqid") %>%
  # Summarize in order to have the sum for each category and topphylum
  group_by(sample, topphylum) %>% 
  summarise(relab = sum(relab), .groups = 'drop') %>%
  inner_join(smd, by = "sample") %>%
  mutate(age = as.numeric(.$age)) %>% 
  mutate(fraction = case_when(
    fraction == "non-fractionated" & medium == "ace" ~ "Acetate, all cells",
    fraction == "non-fractionated" & medium == "lys" ~ "Lysate, all cells",
    fraction == "fractionated" & medium == "ace" ~ "Acetate, < 0.45 µm fraction",
    fraction == "fractionated" & medium == "lys" ~ "Lysate, < 0.45 µm fraction"
  )) %>%
  mutate(topphylum = factor(topphylum, levels = rev(names(cols.phylum)))) %>%
  # Call the plot
  ggplot(aes(
    age, 
    relab * 100, 
    fill = topphylum)) +
  geom_col() + 
  scale_x_continuous(n.breaks = 10) +
  facet_wrap(~ fraction, nrow = 1) +
  labs(x = 'Week', y = 'Relative abundance (%)', fill = '') +
  scale_fill_manual(values = cols.phylum) + 
  guides(fill = guide_legend(reverse = TRUE, ncol = 3)) +
  theme_tidy(legend = "bottom") + 
  theme(
    aspect.ratio = 1.2,
    panel.border = element_rect(fill = "transparent", linewidth = 0.5),
    strip.background = element_blank(), 
    legend.background = element_blank(),
    legend.key.height = unit(4, "mm"),
    legend.key.spacing.y = unit(0, "mm"),
    plot.margin = margin(),
    axis.ticks.length = unit(0.5, "mm"),
    legend.box.margin = margin(t = -5)
    )

Expand code
ggsave("figures/fig-s3.pdf", height = 9, width = 18, units = "cm")

Barplot KR0015 - SA1420 - SA2600

Expand code
t <- seqtab %>%
  inner_join(smd, by = "sample") %>% filter(groundwater == "KR0015") %>%
  filter(grepl("P26", sample)) %>%
  inner_join(tax, by = "seqid") %>%
  group_by(phylum, sample) %>%
  # Sum the abundance of each phylum within a sample
  summarise(relab = sum(relab), .groups = 'drop_last') %>%
  # Calculate the mean abundance of each phylum over the categories
  summarise(mean_relab = sum(relab), .groups = 'drop') %>%
  top_n(8, mean_relab) %>% arrange(desc(mean_relab)) %>% pull(phylum)
Expand code
cols.phylum <- c(
  "Pseudomonadota" = "#972D15",
  "Bacillota" = "#1B5331",
  "Bacteroidota" = "#D3D4D8",
  "Desulfobacterota" = "#A2A475",
  "Patescibacteria" = "#C7B19C",
  "Omnitrophota" = "#00A08A",
  "Spirochaetota" = "#D8B70A",
  "Campylobacterota" = "#79402E",
  "Acidobacteriota" = "#FAEFD1",
  "Nanoarchaeota" = "#046C9A",
  "Actinomycetota" = "#C6CDF7",
  "Verrucomicrobiota" = "#D69C4E",
  "Other" = "#899DA4"
)
Expand code
p1 <- seqtab %>%
  filter(grepl("P26", sample)) %>%
  inner_join(tax, by = "seqid") %>% 
  mutate(phylum = if_else(phylum %in% t, phylum, "Other")) %>%
  # Summarize in order to have the sum for each category and topphylum
  group_by(sample, phylum) %>% 
  summarise(relab = sum(relab), .groups = 'drop') %>%
  inner_join(smd, by = "sample") %>% filter(groundwater == "KR0015") %>%
  mutate(phylum = factor(phylum, levels = names(rev(cols.phylum)))) %>%
  mutate(fraction = case_when(
    fraction == "fractionated" & medium == "lys" ~ "Fractionated: lysate",
    fraction == "fractionated" & medium == "ace" ~ "Fractionated: acetate",
    fraction == "non-fractionated" & medium == "lys" ~ "All cells: lysate",
    fraction == "non-fractionated" & medium == "ace" ~ "All cells: acetate",
  )) %>%
   # Call the plot
  ggplot(aes(
    sample, 
    relab * 100, 
    fill = phylum)) +
  geom_col(show.legend = TRUE) + 
  facet_wrap(~ fraction, scales = "free_x", nrow = 1) +
  labs(x = "", y = 'Relative abundance (%)', fill = '') +
  scale_fill_manual(values = cols.phylum, drop = FALSE) + 
  theme_tidy(legend = "bottom") + guides(fill = guide_legend(nrow = 3, reverse = TRUE)) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.key.width = unit(3.5, "mm"),
    legend.text = element_text(margin = margin(l = 2)),
    legend.margin = margin(t = -10)
  )
Expand code
t <- seqtab %>%
  inner_join(smd, by = "sample") %>% filter(groundwater == "SA1420") %>%
  filter(grepl("P26", sample)) %>%
  inner_join(tax, by = "seqid") %>%
  group_by(phylum, sample) %>%
  # Sum the abundance of each phylum within a sample
  summarise(relab = sum(relab), .groups = 'drop_last') %>%
  # Calculate the mean abundance of each phylum over the categories
  summarise(mean_relab = sum(relab), .groups = 'drop') %>%
  top_n(8, mean_relab) %>% arrange(desc(mean_relab)) %>% pull(phylum)
Expand code
p2 <- seqtab %>%
  filter(grepl("P26", sample)) %>%
  inner_join(tax, by = "seqid") %>% 
  mutate(phylum = if_else(phylum %in% t, phylum, "Other")) %>%
  # Summarize in order to have the sum for each category and topphylum
  group_by(sample, phylum) %>% 
  summarise(relab = sum(relab), .groups = 'drop') %>%
  inner_join(smd, by = "sample") %>% filter(groundwater == "SA1420") %>%
  mutate(phylum = factor(phylum, levels = names(rev(cols.phylum)))) %>%
  mutate(fraction = case_when(
    fraction == "fractionated" & medium == "lys" ~ "Fractionated: lysate",
    fraction == "fractionated" & medium == "ace" ~ "Fractionated: acetate",
    fraction == "non-fractionated" & medium == "lys" ~ "All cells: lysate",
    fraction == "non-fractionated" & medium == "ace" ~ "All cells: acetate",
  )) %>%
   # Call the plot
  ggplot(aes(
    sample, 
    relab * 100, 
    fill = phylum)) +
  geom_col(show.legend = TRUE) + 
  facet_wrap(~ fraction, scales = "free_x", nrow = 1) +
  labs(x = "", y = 'Relative abundance (%)', fill = '') +
  scale_fill_manual(values = cols.phylum, drop = FALSE) + 
  theme_tidy(legend = "bottom") + guides(fill = guide_legend(nrow = 3, reverse = TRUE)) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.key.width = unit(3.5, "mm"),
    legend.text = element_text(margin = margin(l = 2)),
    legend.margin = margin(t = -10)
  )
Expand code
t <- seqtab %>%
  inner_join(smd, by = "sample") %>% filter(groundwater == "SA2600") %>%
  filter(grepl("P26", sample)) %>%
  inner_join(tax, by = "seqid") %>%
  group_by(phylum, sample) %>%
  # Sum the abundance of each phylum within a sample
  summarise(relab = sum(relab), .groups = 'drop_last') %>%
  # Calculate the mean abundance of each phylum over the categories
  summarise(mean_relab = sum(relab), .groups = 'drop') %>%
  top_n(8, mean_relab) %>% arrange(desc(mean_relab)) %>% pull(phylum)
Expand code
p3 <- seqtab %>%
  filter(grepl("P26", sample)) %>%
  inner_join(tax, by = "seqid") %>% 
  mutate(phylum = if_else(phylum %in% t, phylum, "Other")) %>%
  # Summarize in order to have the sum for each category and topphylum
  group_by(sample, phylum) %>% 
  summarise(relab = sum(relab), .groups = 'drop') %>%
  inner_join(smd, by = "sample") %>% filter(groundwater == "SA2600") %>%
  mutate(phylum = factor(phylum, levels = names(rev(cols.phylum)))) %>%
  mutate(fraction = case_when(
    fraction == "fractionated" & medium == "lys" ~ "Fractionated: lysate",
    fraction == "fractionated" & medium == "ace" ~ "Fractionated: acetate",
    fraction == "non-fractionated" & medium == "lys" ~ "All cells: lysate",
    fraction == "non-fractionated" & medium == "ace" ~ "All cells: acetate",
  )) %>%
   # Call the plot
  ggplot(aes(
    sample, 
    relab * 100, 
    fill = phylum)) +
  geom_col(show.legend = TRUE) + 
  facet_wrap(~ fraction, scales = "free_x", nrow = 1) +
  labs(x = "", y = 'Relative abundance (%)', fill = '') +
  scale_fill_manual(values = cols.phylum, drop = FALSE) + 
  theme_tidy(legend = "bottom") + guides(fill = guide_legend(nrow = 3, reverse = TRUE)) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.key.width = unit(3.5, "mm"),
    legend.text = element_text(margin = margin(l = 2)),
    legend.margin = margin(t = -10)
  )
Expand code
p1 + p2 + p3 + plot_layout(nrow = 3, ncol = 1, guides = "collect", widths = c(2,2,1)) + 
  plot_annotation(tag_levels = "a") & theme(
  axis.ticks.x = element_blank(),
  axis.text.x = element_blank(),
  axis.title.x = element_blank(),
  plot.tag.location = "panel",
  plot.tag.position = c(-0.05, 1.06),
  legend.position = "bottom",
  legend.box.margin = margin(t = 10),
  aspect.ratio = 1
)

Expand code
ggsave(filename = "oversight.pdf", width = 180, height = 180, units = "mm")

Genome-resolved metagenomics

Basic stats on the MAGs

Expand code
t <- c("Bacillota","Desulfobacterota","Pseudomonadota","Patescibacteria","Nanoarchaeota","Other")

# Prepare data for plot
i <- quality %>%
  inner_join(bintax, by = "genome") %>%
  mutate(phylum = ifelse(phylum %in% t, phylum, "Other")) %>%
  mutate(phylum = factor(phylum, levels = t)) %>%
  select(genome, phylum, completeness, gc_content, genome_size, coding_density) %>%
  distinct() %>%
  mutate(genome.size = genome_size / 1e6)
Expand code
lm(gc_content ~ genome_size, data = quality) %>% summary()

Call:
lm(formula = gc_content ~ genome_size, data = quality)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.15655 -0.06232 -0.02604  0.07979  0.13851 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 3.058e-01  3.670e-02   8.332 1.26e-09 ***
genome_size 5.080e-08  1.029e-08   4.935 2.23e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.09515 on 33 degrees of freedom
Multiple R-squared:  0.4246,    Adjusted R-squared:  0.4072 
F-statistic: 24.36 on 1 and 33 DF,  p-value: 2.233e-05
Expand code
p1 <- i %>%
  ggplot(aes(
    genome.size, 
    gc_content, 
    color = phylum,
    fill = phylum
    )) +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black", fill = "#d9d9d9", se = TRUE, linewidth = 0.4) +
  geom_point(size = 2.5, shape = 21, stroke = 0.5, key_glyph = "rect") + 
  labs(x = "Estimated genome size (Mb)", y = "Estimated GC content (%)", color = "") +
  scale_colour_manual(values = cols.mag, guide = "none") +
  scale_fill_manual(values = alpha(cols.mag, alpha = 0.5)) +
  theme_tidy() + 
  scale_x_continuous(breaks = c(1,2,3,4,5,6)) +
  annotate("text", x = Inf, y = -Inf, 
           label = "paste(R ^ 2, \" = 0.41\")", 
           size = 7/.pt, vjust = -0.5, hjust = 1, parse = TRUE) +
  theme(
    axis.line = element_line(),
    panel.border = element_blank()
  )
Expand code
p2 <- i %>%
  mutate(phylum = factor(phylum, levels = c(
    "Nanoarchaeota", "Patescibacteria", "Other", "Bacillota", "Desulfobacterota", "Pseudomonadota"
  ))) %>%
  ggplot(aes(
    phylum, 
    completeness, 
    group = phylum, 
    fill = phylum, 
    color = phylum
    )) +
  geom_boxplot(linewidth = 0.5, outlier.colour = "white") +
  geom_jitter(width = 0.2, size = 2, shape = 21, stroke = 0.5) +
  labs(x = "", y = "Estimated completeness (%)", fill = "") +
  scale_fill_manual(values = alpha(cols.mag, alpha = 0.5), guide = "none") + 
  scale_color_manual(values = cols.mag, guide = "none") +
  theme_tidy(legend = "bottom") +
  theme(
    axis.line = element_line(),
    panel.border = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_text(margin = margin()),
    plot.margin = margin()
  )
Expand code
lm(coding_density ~ genome_size, data = quality) %>% summary()

Call:
lm(formula = coding_density ~ genome_size, data = quality)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.05556 -0.01303 -0.00046  0.01275  0.03787 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.172e-01  8.112e-03 113.064  < 2e-16 ***
genome_size -6.991e-09  2.275e-09  -3.072  0.00424 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.02103 on 33 degrees of freedom
Multiple R-squared:  0.2224,    Adjusted R-squared:  0.1989 
F-statistic: 9.439 on 1 and 33 DF,  p-value: 0.004236
Expand code
p3 <- i %>%
  ggplot(aes(
    genome.size, 
    coding_density, 
    color = phylum,
    fill = phylum
    )) +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black", fill = "#d9d9d9", se = TRUE, linewidth = 0.4) +
  geom_point(size = 2.5, shape = 21, stroke = 0.5) + 
  labs(x = "Estimated genome size (Mb)", y = "Estimated coding density", color = "") +
  scale_colour_manual(values = cols.mag, guide = "none") +
  scale_fill_manual(values = alpha(cols.mag, alpha = 0.5), guide = "none") +
  theme_tidy() + 
  scale_x_continuous(breaks = c(1,2,3,4,5,6)) +
  scale_y_continuous(n.breaks = 7) +
  annotate("text", x = Inf, y = Inf, 
           label = "paste(R ^ 2, \" = 0.20\")", 
           size = 7/.pt, vjust = 1, hjust = 1, parse = TRUE) +
  theme(
    axis.line = element_line(),
    panel.border = element_blank()
  )
Expand code
nmds <- coverm %>%
  # Sum the abundance of both coverage per metagenome
  mutate(abundance = 1) %>% 
  inner_join(emapper, by = "genome", relationship = "many-to-many") %>%
  # Filter out NA and add multiple KOs on new line
  filter(kegg_ko != "-") %>% separate_rows(kegg_ko, sep = ",") %>%
  # Sum the abundance of each KO within the metagenomes
  group_by(genome, kegg_ko) %>% 
  summarise(abundance = sum(abundance), .groups = "drop") %>%
  # Pivot to prepare a numerical matrix
  pivot_wider(names_from = "kegg_ko", values_from = "abundance", values_fill = 0) %>%
  column_to_rownames("genome") %>%
  vegan::metaMDS()
Square root transformation
Wisconsin double standardization
Run 0 stress 0.06277184 
Run 1 stress 0.06660901 
Run 2 stress 0.06506398 
Run 3 stress 0.06263043 
... New best solution
... Procrustes: rmse 0.002502074  max resid 0.01128171 
Run 4 stress 0.0661089 
Run 5 stress 0.07059065 
Run 6 stress 0.06647542 
Run 7 stress 0.06601262 
Run 8 stress 0.06459214 
Run 9 stress 0.06898965 
Run 10 stress 0.0650395 
Run 11 stress 0.06277191 
... Procrustes: rmse 0.002517934  max resid 0.01114544 
Run 12 stress 0.06539869 
Run 13 stress 0.06277223 
... Procrustes: rmse 0.002536871  max resid 0.01104603 
Run 14 stress 0.06277183 
... Procrustes: rmse 0.002499626  max resid 0.01127226 
Run 15 stress 0.06263054 
... Procrustes: rmse 7.567288e-05  max resid 0.0002464256 
... Similar to previous best
Run 16 stress 0.06660889 
Run 17 stress 0.06360088 
Run 18 stress 0.06299217 
... Procrustes: rmse 0.0781651  max resid 0.1490143 
Run 19 stress 0.06286022 
... Procrustes: rmse 0.0781924  max resid 0.1486652 
Run 20 stress 0.0660097 
*** Best solution repeated 1 times
Expand code
nmds.scores <- vegan::scores(nmds)$sites %>%
  as.data.frame() %>%
  rownames_to_column("genome")
Expand code
p4 <- nmds.scores %>%
  inner_join(bintax, by = "genome") %>%
  mutate(phylum = ifelse(phylum %in% t, phylum, "Other")) %>%
  # Plot
  ggplot(aes(x = NMDS1 * -1, 
             y = NMDS2, 
             colour = fct_relevel(phylum,t),
             fill = phylum
             )) +
  geom_vline(xintercept = 0, linetype = 'dotted', linewidth = 0.5) +
  geom_hline(yintercept = 0, linetype = 'dotted', linewidth = 0.5) +
  # Points for samples, coloured by nature
  geom_point(size = 3, shape = 21, stroke = 0.5) +
  theme_tidy() + labs(x = "NMDS1", y = "NMDS2") +
  scale_color_manual(values = cols.mag, name = "Phylum", guide = "none") +
  scale_fill_manual(values = alpha(cols.mag, alpha = 0.5), name = "", guide = "none") +
  scale_y_continuous(breaks = c(-0.5, 0, 0.5)) + 
  annotate('text', x = -Inf, y = -Inf, size = 7/.pt,
           label = paste('Stress = ', round(nmds$stress, digits = 2)),
           hjust = -0.1, vjust = -0.5) +
  theme(
    axis.title.y = element_text(margin = margin()),
    legend.spacing.x = unit(-0.5, "mm")
    )
Expand code
library(patchwork)

p1 + p2 + p3 + p4 + plot_layout(guides = "collect", nrow = 2) & plot_annotation(tag_levels = c("a", "b", "c", "d")) &
  theme(
    plot.margin = margin(),
    plot.tag.location = "panel",
    plot.tag.position = c(0.04, 0.95),
    legend.position = "bottom",
    axis.ticks.length = unit(".75", "mm"),
    legend.box.margin = margin(),
    legend.text = element_text(hjust = 0),
    legend.key.size = unit("3", "mm"),
    legend.title = element_blank()
    )

Expand code
ggsave(filename = "figures/fig-4.pdf", width = 12, height = 12, units = "cm")

Metabolic weight

Expand code
samples <- coverm$sample %>% unique()

mw <- read_tsv("data/metabolic.tsv.gz", show_col_types = FALSE)
Expand code
i <- list(sample = c("VK-3516-l3u_S16", "VK-3516-l3u_S16"),
          genome = c("MEGAHIT-MaxBin2Refined-VK-3516-KFM02a-3_S40_L002.026", "MEGAHIT-MetaBAT2Refined-VK-3516-KFM02a-3_S40_L002.49_sub"),
          pathway = c("Organic carbon oxidation - complex carbon degradation", "Organic carbon oxidation - complex carbon degradation"), 
          mw.function = c(0, 0), 
          mw = c(0, 0))

mw <- mw %>% 
  # Some MAGs do not have any metabolic weight: assign a dummy
  rbind(i) %>%
  mutate(label = case_when(
  pathway == "Organic carbon oxidation - complex carbon degradation" ~ "01: Organic C oxidation: complex C",
  pathway == "Organic carbon oxidation - amino acid utilization" ~ "02: Organic C oxidation: AA utilization",
  pathway == "Organic carbon oxidation - aromatics degradation" ~ "03: Organic C oxidation: aromatics",
  pathway == "Organic carbon oxidation - methanol oxidation" ~ "04: Organic carbon oxidation: methanol oxidation",
  pathway == "Organic carbon oxidation - fatty acid degradation" ~ "05: Organic C oxidation: fatty acid degradation",
  pathway == "Carbon fixation - CBB cycle (Rubisco)" ~ "06: C fixation: CBB cycle",
  pathway == "Carbon fixation - Reverse TCA cycle" ~ "07: C fixation: Reverse TCA",
  pathway == "Carbon fixation - 3HP/4HB" ~ "08: C fixation: 3HP/4HB",
  pathway == "Carbon fixation - Wood-Ljungdahl pathway" ~ "09: C fixation: Wood-Ljungdahl",
  pathway == "Acetate oxidation" ~ "10: Acetate oxidation",
  pathway == "Ethanol oxidation" ~ "11: Ethanol oxidation",
  pathway == "Fermentation" ~ "12: Fermentation",
  pathway == "Hydrogen generation" ~ "13: Hydrogen generation",
  pathway == "Hydrogen oxidation" ~ "14: Hydrogen oxidation",
  pathway == "N2 fixation - nifDK||vnfDKG||nifH" ~ "15: N fixation: nifDK, nifH",
  pathway == "Nitrate reduction - napAB" ~ "16: Nitrate reduction: napAB",
  pathway == "Nitrite ammonification - nirBD" ~ "17: Nitrite ammonification: nirBD",
  pathway == "Nitrite ammonification - nrfADH" ~ "18: Nitrite ammonification: nrfADH",
  pathway == "Sulfur oxidation - sdo" ~ "19: Sulfur oxidation: sdo",
  pathway == "Sulfate reduction" ~ "20: Sulfate reduction",
  pathway == "Sulfide oxidation - sqr" ~ "21: Sulfide oxidation: sqr",
  pathway == "Iron oxidation:" ~ "22: Iron oxidation",
  pathway == "Iron reduction" ~ "23: Iron reduction",
  .default = "24: Other"
)) %>%
  mutate(order = substr(label, 1, 2)) %>%
  mutate(label = substr(label, 5, nchar(.))) %>%
  arrange(order)   


cols.phylum <- c(
  "Desulfobacterota" = "#A2A475",
  "Pseudomonadota" = "#972D15",
  "Bacillota" = "#1B5331",
  "Patescibacteria" = "#C7B19C",
  "Nanoarchaeota" = "#00A08A",  
  "Chloroflexota" = "#D69C4E",
  "Spirochaetota" = "#D8B70A",
  "Campylobacterota" = "#FAEFD1",
  "Iainarchaeota" = "#046C9A"
)
Expand code
name <- bintax %>%
  filter(phylum %in% names(cols.phylum)) %>%
  mutate(phylum = factor(phylum, levels = names(cols.phylum))) %>%
  arrange(phylum) %>%
  mutate(genus = if_else(
    genome == "MEGAHIT-MaxBin2Refined-VK-3516-KFM02a-3_S40_L002.019_sub", 
    true = paste(family, "_2", sep = ""), false = genus
    )) %>%
  mutate(genus = case_match(genus, NA ~ family, .default = genus)) %>%
  mutate(taxon = if_else(is.na(species), paste(genus, "sp."), species)) %>%
  pull(taxon) %>% unique()

p1 <- bintax %>%
  # Use subset of bins
  filter(phylum %in% names(cols.phylum)) %>%
  mutate(taxonomy = "Taxonomy") %>%
  mutate(genus = if_else(
    genome == "MEGAHIT-MaxBin2Refined-VK-3516-KFM02a-3_S40_L002.019_sub", 
    true = paste(family, "_2", sep = ""), false = genus
    )) %>%
  mutate(genus = case_match(genus, NA ~ family, .default = genus)) %>%
  mutate(taxon = if_else(is.na(species), paste(genus, "sp."), species)) %>%
  # Plot
  ggplot(aes(fct_relevel(taxon, name), 
             taxonomy, 
             fill = fct_relevel(phylum, names(cols.phylum))
             )) +
  geom_tile(colour = "white") +
  labs(x = "", y = "", fill = "") +
  scale_fill_manual(values = cols.phylum, guide = "none") +
  scale_x_discrete(position = "top") + 
  theme_tidy() +
  coord_equal() +
  theme(
    legend.position = "left",
    legend.key.height = unit(3, "mm"),
    legend.key.width = unit(3, "mm"),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 0),
    axis.text.y = element_blank(),
    panel.border = element_rect(linewidth = 1,fill = NA)
  )
Expand code
i <- mw %>%
  filter(label != "Other") %>%
  pull(label) %>% unique()

p2 <- mw %>%
  group_by(genome, label) %>% summarise(mw = mean(mw), .groups = "drop") %>%
  inner_join(bintax, by = "genome") %>% filter(phylum %in% names(cols.phylum)) %>%
  mutate(genus = if_else(
    genome == "MEGAHIT-MaxBin2Refined-VK-3516-KFM02a-3_S40_L002.019_sub", 
    true = paste(family, "_2", sep = ""), false = genus
    )) %>%
  mutate(genus = case_match(genus, NA ~ family, .default = genus)) %>%
  mutate(taxon = if_else(is.na(species), paste(genus, "sp."), species)) %>%
  # Order the genomes
  mutate(taxon = factor(taxon, name)) %>%
  # Order the pathways
  mutate(label = factor(label, levels = rev(i))) %>%
  # Filter out pathways
  filter(!is.na(label)) %>%
  # Plot
  ggplot(aes(
    taxon, 
    label
    )) +
  geom_point(aes(size = mw), shape = 21, stroke = 0.5, fill = "white") +
  theme_tidy() +
  geom_vline(xintercept = 7.5, linewidth = 0.2) +
  geom_vline(xintercept = 13.5, linewidth = 0.2) +
  geom_vline(xintercept = 19.5, linewidth = 0.2) +
  geom_vline(xintercept = 22.5, linewidth = 0.2) +
  geom_vline(xintercept = 24.5, linewidth = 0.2) +
  labs(x = "", y = "", size = "Metabolic\nweight score") +
  theme(
    legend.position = "bottom",
    legend.spacing.x = unit(0, "mm"),
    legend.box.margin = margin(),
    panel.border = element_rect(linewidth = 0.75, fill = NA),
    axis.ticks = element_blank(),
    axis.text.x = element_blank(),
    panel.grid.major = element_blank()
  )
Expand code
library(patchwork)

p1 / p2 & theme(
  legend.box.margin = margin(l = -8),
  plot.margin = margin(b = 2),
  legend.key.width = unit(2, "mm"),
  legend.text = element_text(margin = margin(l = 2))
  )

Expand code
ggsave("figures/fig-x.pdf", width = 18, height = 18, units = "cm")